In [63]:
%matplotlib inline

from matplotlib import pyplot as plt
import matplotlib.mlab as mlab
import csv
from scipy.stats import norm
import numpy as np
import scipy.stats as stats
import pandas as pd
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

In [6]:
# Read the voxel CSV and parse every data row into a list of ints.
# Fixes vs. original: `with` closes the file handle (the bare open()
# leaked it), and next(reader) replaces the Python-2-only reader.next().
fieldnames = ['x', 'y', 'z', 'unmasked', 'synapses']
with open('../data/data.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    rows = [[int(col) for col in row] for row in reader]

In [7]:
df = pd.DataFrame(rows)

In [8]:
df.columns = ['x', 'y', 'z', 'unmasked', 'synDen']

In [9]:
df = df[df.unmasked != 0]
df = df[df.synDen != 0]

In [10]:
df['synDen'] = df['synDen']/df['unmasked']

In [11]:
# KMeans expects a 2-D (n_samples, n_features) array.  Go through the
# underlying NumPy array: Series.reshape was deprecated and later removed
# from pandas, so df['synDen'].reshape(-1, 1) breaks on modern versions.
synData = df['synDen'].values.reshape(-1, 1)
print(synData)  # print() function form works on both Python 2 and 3


[[ 0.00113788]
 [ 0.00021915]
 [ 0.00088586]
 ..., 
 [ 0.00065746]
 [ 0.00043831]
 [ 0.00070205]]

In [12]:
from sklearn.cluster import KMeans

In [21]:
# Fit one k-means model per candidate k (1..25) so the BIC elbow can be
# inspected below.  `model` names each fitted estimator (the original
# used `X`, conventionally reserved for data).
k_range = range(1, 26)
k_means_var = [KMeans(n_clusters=k).fit(synData) for k in k_range]
centroids = [model.cluster_centers_ for model in k_means_var]

In [22]:
print k_means_var


[KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=1, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=4, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=5, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=6, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=7, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=8, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=9, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=11, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=12, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=13, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=14, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=15, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=16, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=17, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=18, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=19, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=20, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=21, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=22, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=23, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=24, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0), KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=25, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)]

In [23]:
from scipy.spatial import distance
def compute_bic(kmeans, X):
    """
    Compute the Bayesian Information Criterion (BIC) for a fitted k-means
    model, treating each cluster as a spherical Gaussian with a pooled
    variance estimate.

    Parameters
    ----------
    kmeans : fitted clustering object exposing ``cluster_centers_``,
        ``labels_`` and ``n_clusters`` (e.g. sklearn KMeans)
    X : (N, d) np.ndarray of the data points the model was fit on

    Returns
    -------
    float
        BIC value (larger is better under this sign convention).
    """
    # assign centers and labels
    centers = [kmeans.cluster_centers_]
    labels = kmeans.labels_
    # number of clusters
    m = kmeans.n_clusters
    # size of each cluster (points per label)
    n = np.bincount(labels)
    # size and dimensionality of the data set
    N, d = X.shape

    # Pooled within-cluster variance, normalised by N - m degrees of
    # freedom (assumes N > m; equal sizes would divide by zero).
    cl_var = (1.0 / (N - m) / d) * sum([sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2) for i in range(m)])

    # Model-complexity penalty: 0.5 * (free parameters) * log(N).
    const_term = 0.5 * m * np.log(N) * (d + 1)

    # Per-cluster log-likelihood terms.  Explicit float division (/ 2.0):
    # under Python 2 the original (n[i] * d) / 2 floors to an integer and
    # silently skews the result; on Python 3 this is identical.
    BIC = np.sum([n[i] * np.log(n[i]) -
               n[i] * np.log(N) -
             ((n[i] * d) / 2.0) * np.log(2 * np.pi * cl_var) -
             ((n[i] - 1) * d / 2.0) for i in range(m)]) - const_term

    return BIC

In [24]:
# BIC for every fitted model; the elbow of this curve suggests a good k.
BIC = [compute_bic(model, synData) for model in k_means_var]
plt.plot(k_range, BIC, 'r-o')
plt.title("cluster vs BIC")
plt.xlabel("# clusters")
plt.ylabel("BIC")  # was "# BIC": the axis shows the BIC value, not a count

#Code From: http://www.slideshare.net/SarahGuido/kmeans-clustering-with-scikitlearn
#Code From: http://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans


Out[24]:
<matplotlib.text.Text at 0x10aca37d0>

In [25]:
#optimum number of clusters is ~6 judging by the elbow of the BIC curve
#(NOTE: the fit below uses n_clusters = 7 — confirm the intended k)

In [13]:
# Final clustering of the synapse densities.
# NOTE(review): the elbow analysis above points at ~6 clusters but 7 are
# fit here — confirm the intended k.  A fixed random_state makes the
# (otherwise stochastic) label assignment reproducible across re-runs.
km = KMeans(n_clusters=7, random_state=0)
km.fit(synData)
labels = km.labels_
df['clusterGroup'] = labels

print(df)  # pandas truncates the 50k-row frame; df.head() would be quieter


          x     y     z  unmasked    synDen  clusterGroup
5        19  1369   610     14940  0.001138             5
9        19  1369  1054      4563  0.000219             0
16       19  1408   610     14675  0.000886             2
22       19  1447    55      4856  0.000206             0
27       19  1447   610     14396  0.001111             5
38       19  1486   610     14009  0.001499             1
42       19  1486  1054      4563  0.000219             0
49       19  1525   610     13810  0.001231             5
60       19  1564   610     13271  0.000904             2
71       19  1603   610     13073  0.000306             0
77       19  1642    55      7943  0.000252             0
82       19  1642   610     12743  0.000157             0
88       19  1681    55      8490  0.000236             0
93       19  1681   610     11616  0.000172             0
99       19  1720    55      9468  0.000211             0
104      19  1720   610     10081  0.000397             0
110      19  1759    55     10312  0.000776             2
115      19  1759   610     10220  0.000098             0
117      19  1759   832      4563  0.000219             0
121      19  1798    55     10520  0.000951             2
126      19  1798   610     11332  0.000441             0
132      19  1837    55     10801  0.000278             0
137      19  1837   610     10936  0.000457             6
139      19  1837   832      4563  0.000438             0
154      19  1915    55     11418  0.000350             0
165      19  1954    55     12838  0.000779             2
170      19  1954   610     10002  0.000100             0
176      19  1993    55     13681  0.000512             6
181      19  1993   610      9706  0.000309             0
187      19  2032    55     14006  0.000500             6
...     ...   ...   ...       ...       ...           ...
61574  4192  2656   832      9126  0.002301             3
61578  4192  2695    55     13271  0.000301             0
61583  4192  2695   610     10647  0.001409             1
61585  4192  2695   832      9126  0.000986             2
61589  4192  2734    55     11813  0.000169             0
61594  4192  2734   610     10300  0.000583             6
61596  4192  2734   832      9126  0.000110             0
61600  4192  2773    55     10904  0.000183             0
61605  4192  2773   610     10377  0.000096             0
61607  4192  2773   832      9126  0.000767             6
61611  4192  2812    55     10602  0.000189             0
61614  4192  2812   388      4563  0.000877             2
61616  4192  2812   610     10647  0.000282             0
61618  4192  2812   832      9126  0.001534             4
61622  4192  2851    55      9980  0.000802             2
61627  4192  2851   610     10196  0.000392             0
61629  4192  2851   832      9126  0.000767             6
61633  4192  2890    55      9952  0.000402             0
61638  4192  2890   610      9126  0.000767             6
61640  4192  2890   832      9078  0.000551             6
61649  4192  2929   610      9007  0.000333             0
61651  4192  2929   832      7342  0.000953             2
61655  4192  2968    55      8693  0.001035             2
61660  4192  2968   610      8657  0.000693             6
61662  4192  2968   832      9126  0.001315             1
61666  4192  3007    55      4794  0.000834             2
61671  4192  3007   610      6367  0.000314             0
61673  4192  3007   832      9126  0.000657             6
61684  4192  3046   832      9126  0.000438             0
61695  4192  3085   832      7122  0.000702             6

[50180 rows x 6 columns]

In [23]:
# One 3-D scatter with all cluster groups overlaid, one colour per group.
# A loop replaces the seven copy-pasted filter + scatter pairs; the
# per-group frames (df0..df6) were only used inside this cell and are
# re-derived in the next cell anyway.
fig = plt.figure()
ax = fig.gca(projection='3d')

cluster_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for group, color in enumerate(cluster_colors):
    members = df[df.clusterGroup == group]
    ax.scatter(members['x'], members['y'], members['z'], c=color)

plt.show()



In [30]:
def plot_cluster_3d(members, group, color):
    """Scatter one cluster's voxels in 3-D with labelled axes and a title."""
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.scatter(members['x'], members['y'], members['z'], c=color)
    plt.xlabel('X Coordinate', labelpad=30)
    plt.ylabel('Y Coordinate', labelpad=30)
    plt.gca().set_zlabel('Z Coordinate', labelpad=30)
    plt.title("Cluster Group: " + str(group))
    plt.show()

# One figure per cluster.  Fixes relative to the copy-pasted originals:
#  - group 2 used labelpad=300 (apparent typo for the 30 used everywhere else)
#  - group 6 was drawn in white ('w'), invisible on a white background;
#    black ('k') matches the colour used for group 6 in the combined plot
cluster_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for group, color in enumerate(cluster_colors):
    plot_cluster_3d(df[df.clusterGroup == group], group, color)



In [31]:
# Re-read the voxel CSV (same fixes as the first load cell: `with` closes
# the leaked file handle, next(reader) replaces Python-2-only reader.next()).
# NOTE(review): this duplicates the load at the top of the notebook —
# consider reusing those `rows` instead of re-parsing.
fieldnames = ['x', 'y', 'z', 'unmasked', 'synapses']
with open('../data/data.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    rows = [[int(col) for col in row] for row in reader]

In [32]:
# Unique, ascending coordinate values along each axis; used below to map a
# raw coordinate to its grid index via .index().
sorted_x = sorted({row[0] for row in rows})
sorted_y = sorted({row[1] for row in rows})
sorted_z = sorted({row[2] for row in rows})

In [64]:
# Build a (y, x, z) volume holding clusterGroup + 1 per voxel (the +1 keeps
# cluster label 0 distinguishable from empty voxels, which stay 0), then
# render one heatmap per y-layer.
# Fixes: .values replaces the deprecated DataFrame.as_matrix(); the matrix
# gets its own name instead of clobbering `data`; figures are closed after
# display so 50+ of them don't accumulate in memory.
cluster_matrix = df.values
volume = np.zeros((len(sorted_y), len(sorted_x), len(sorted_z)))
for row in cluster_matrix:
    volume[sorted_y.index(row[1]), sorted_x.index(row[0]), sorted_z.index(row[2])] = row[-1] + 1

for layer in volume:
    fig = plt.figure()
    ax = sns.heatmap(layer, yticklabels=False, xticklabels=False)
    plt.show()
    plt.close(fig)



In [65]:
# Quick sanity checks on the volume's contents and shape
# (print() function form is Python-3 compatible).
print(volume[0][0])
print(len(volume[1]))
print(len(volume))


[ 0.  0.  0.  0.  0.  6.  0.  0.  0.  1.  0.]
108
52

In [33]:
# One 2-D (x, z) scatter per y-layer, coloured by cluster group.
# An inner loop over groups replaces the seven copy-pasted filter +
# scatter pairs; group 6 is drawn in black ('k') instead of white ('w'),
# which was invisible on the white background.  Each figure is closed
# after display so dozens of them don't accumulate in memory.
cluster_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i in sorted_y:
    layer = df[df.y == i]

    fig = plt.figure()
    ax = fig.add_subplot(111)

    for group, color in enumerate(cluster_colors):
        members = layer[layer.clusterGroup == group]
        ax.scatter(members['x'], members['z'], c=color)

    plt.xlabel('X Coordinate', labelpad=30)
    plt.ylabel('Z Coordinate', labelpad=30)
    plt.title("Y Layer: " + str(i))
    plt.show()
    plt.close(fig)



In [ ]: